import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
from time import sleep
import random
import citycode

# 根据关键词和页数构建网页url
def get_url(key,n):
	'''
	Build the list of paginated listing-page addresses for one city.

	key: city code / keyword used as the address prefix
	n: number of pages to generate
	Returns: list of page addresses, one per page (page index is 1-based)
	'''
	# One entry per page: "<key>-jingdian-1-<page>" for page = 1..n.
	return [str(key) + "-jingdian-1-{}".format(page) for page in range(1, n + 1)]

# 爬取网页，解析返回报文提取信息
def get_data(ui, d_h, d_c, keyword):
	'''
	Fetch one listing page and extract every attraction on it.

	ui: url of the listing page to fetch
	d_h: request headers dict (carries the randomized User-Agent)
	d_c: cookies dict
	keyword: city keyword, stored verbatim on every extracted record
	Returns: list of dicts, one per attraction <li> on the page
	Raises: AttributeError if the expected markup is missing (caller
	        catches and records the url as failed).
	'''
	# BUG FIX: the original ignored the d_h / d_c parameters and read the
	# module-level globals dic_heders / dic_cookies instead, so the function
	# only worked when run from this script. Use the parameters.
	ri = requests.get(ui, headers=d_h, cookies=d_c)
	sleep(random.uniform(1, 2))  # pause 1-2s to mimic a human user and avoid anti-scraping
	soup_i = BeautifulSoup(ri.text, 'lxml')
	ul = soup_i.find("ul", class_="list_item clrfix")
	lis = ul.find_all('li')

	lst = []
	for li in lis:
		dic = {}
		# Field-by-field extraction from each <li> entry.
		dic['景点名称'] = li.find('span', class_="cn_tit").text
		dic['景点关键词'] = keyword
		dic['景点图片'] = li.find('a',class_="imglink").img['src']
		dic['攻略数量'] = li.find('div', class_="strategy_sum").text
		# Rating is stored as the inline CSS width of the star bar
		# (e.g. "width:90%"); parsed into a number later in main.
		dic['评分'] = li.find('span', class_="total_star").span['style']
		dic['简介'] = li.find('div', class_="desbox").text
		dic['排名'] = li.find('span', class_="ranking_sum").text
		dic['经度'] = li['data-lng']
		dic['纬度'] = li['data-lat']
		dic['点评数量'] = li.find('div', class_="comment_sum").text
		dic['多少驴友来过'] = li.find('span', class_="comment_sum").span.text
		lst.append(dic)
	return lst


if __name__ == "__main__":
	# ---- cookies configuration ----
	dic_cookies = {}
	cookies = '''QN1=dXrgj14+tmYQhFxKE9ekAg==; QN205=organic; QN277=organic; QN269=506F28C14A7611EAA0BEFA163E244083; 
				_i=RBTKSRDqFhTQT5KRlx-P1H78agxx; fid=7cc3c3d9-3f6c-45e1-8cef-3384cd5da577; Hm_lvt_c56a2b5278263aa647778d304009eafc=1581168271,1581220912; 
				viewpoi=7564992|709275; viewdist=299878-7; uld=1-299878-8-1581221233|1-1062172-1-1581168529; QN267=1679639433d5aedfc8; 
				Hm_lpvt_c56a2b5278263aa647778d304009eafc=1581221236; QN25=cb06bfbd-d687-4072-98c5-73266b637a6a-9f992f90; QN42=nvxp8441; 
				_q=U.qunar_lbs_428305502; _t=26463150; csrfToken=oXYBnhSoGAGxggRkzmAjbxxGrpgsjUqQ; _s=s_ZBWFJO3EEGZISWS35EBIS5NQYA; 
				_v=YTRjW_H5L47nGNVabvTLt1mlh7j8R7t4UNDVRrJUz0wScfLMWgSvkwQbzMLHlFbsvTU-2kJrBK74NUyOi3MX_3obY94Hhhugt8bv8ILxwsWDv4s_ANNiM8qRdg6HlBrrCEnGYr8lxS9uv78zDCNKz9pFbN8JPYy-AKJP6xILIsT7; 
				_vi=4ONQzvfOOhwJECN5R-4rfWZDzlQ5-qv2xi_jsp1INPEpy9iKHa5gV0gHc35fDfTDe3TjcKteU7ZWk1vd6MsIqTfXYyUh3gTwZJ_9z3PEpkXZReeeIjaVE4HwLTkOATLIzIxg92s-QCWKE1RdNlaZsxPnfN7NHPGAZz5rsmxvpNDY; 
				QN44=qunar_lbs_428305502; QN48=tc_a7fe4861b2d918df_17028369fc8_67ab; QN271=1749d44a-1a11-4886-be27-c3e3bfdadb0c'''
	cookies_lst = cookies.split("; ")
	for entry in cookies_lst:
		# BUG FIX: the original split on every '=' and kept only piece [1],
		# truncating values that legitimately contain '=' (e.g. the base64
		# QN1 token ending in '=='). Split on the FIRST '=' only, and strip
		# the newline/tab residue the triple-quoted literal leaves in keys.
		name, _, value = entry.partition("=")
		dic_cookies[name.strip()] = value.strip()

	citylist,code=citycode.get_city_id()
	citylist_=['北京','上海','重庆','天津','厦门','福州','泉州','南平','宁德','杭州','宁波','台州','湖州','南京',
			   '苏州','扬州','无锡','连云港','南通','青岛','济南','合肥','黄山','大连','沈阳','石家庄','哈尔滨',
			   '太原','呼和浩特','呼伦贝尔','长春','三亚','海口','广州','深圳','珠海','长沙','武汉','宜昌','成都',
			   '昆明','大理','宝鸡','拉萨','兰州','乌鲁木齐','遵义','银川'] # cities whose attractions we want

	# ---- data collection ----
	datalst = []   # successfully scraped records
	errorlst = []  # urls whose scrape raised, kept for manual retry
	for item in citylist_:
		key=code[item]
		for u in get_url(key,15):
			try:
				ua = UserAgent(verify_ssl=False)
				dic_heders = {"User-Agent": ua.random} # random UA per request to dodge anti-scraping
				datalst.extend(get_data(u, dic_heders, dic_cookies,item))
				print('数据采集成功，共采集数据{}条'.format(len(datalst)))
			# BUG FIX: was a bare `except:` which also swallows
			# KeyboardInterrupt/SystemExit; catch Exception only.
			except Exception:
				errorlst.append(u)
				print('数据采集失败，网址为：', u)

	# ---- normalization ----
	df = pd.DataFrame(datalst)
	df['经度'] = df['经度'].astype('float')
	df['纬度'] = df['纬度'].astype('float')
	df['点评数量'] = df['点评数量'].astype('int')
	df['攻略数量'] = df['攻略数量'].astype('int')
	# Rating arrives as CSS like "width:90%" -> take the part after ':',
	# drop '%' and convert to float.
	df['评分'] = df['评分'].str.split(":").str[-1].str.replace("%", "").astype("float")
	df['多少驴友来过'] = df['多少驴友来过'].str.replace("%", "").astype('float') / 100
	# Ranking text like "第3名" -> keep digits after "第"; blank entries become NaN.
	df['排名'] = df[df['排名'] != ""]['排名'].str.split("第").str[-1].astype('int')

	# ---- export to Excel ----
	df.to_excel('去哪儿网数据爬取.xlsx', index=True)